In [1]:
from anndata import AnnData
import scrublet as scr
import pandas as pd
import scanpy as sc
import numpy as np
import rbcde
import scipy
import os

sc.settings.verbosity = 3  # verbosity: errors (0), warnings (1), info (2), hints (3)
#all figures produced via sc.pl in this notebook land in this dedicated folder
sc.settings.figdir = './figures-N2-clean-manifold/'
sc.logging.print_versions()
/usr/local/lib/python3.6/dist-packages/dask/config.py:161: YAMLLoadWarning: calling yaml.load() without Loader=... is deprecated, as the default Loader is unsafe. Please read https://msg.pyyaml.org/load for full details.
  data = yaml.load(f.read()) or {}
scanpy==1.4.5.post2 anndata==0.6.22.post1 umap==0.3.10 numpy==1.18.1 scipy==1.4.1 pandas==0.25.3 scikit-learn==0.22.1 statsmodels==0.11.0rc2 python-igraph==0.7.1 louvain==0.6.1
In [2]:
#global figure settings for all inline plots in this notebook
sc.settings.set_figure_params(dpi=80)  # low dpi (dots per inch) yields small inline figures

Import the object from the prior notebook and remove contaminants — the trypsin-treated populations falling outside clusters 5, 12 and 13. Regenerate the raw form of the data.

In [3]:
adata = sc.read('endometrium-N1-integration.h5ad')
#keep every cell in clusters 5/12/13, plus cells with treatment 'C' from the remaining clusters
in_kept_clusters = adata.obs['leiden'].isin(['5','12','13']).values
is_treatment_c = np.asarray(adata.obs['treatment'] == 'C')
adata = adata[in_kept_clusters | (~in_kept_clusters & is_treatment_c)]

#rebuild the object from .raw so .X holds the unprocessed count matrix again
adata = AnnData(adata.raw.X, var=adata.raw.var, obs=adata.obs)

Store the raw form in .raw. Filter out unexpressed genes, recompute mitochondrial percentage, then obtain log(CPM/100 + 1) form of data and store that in .X.

In [4]:
#snapshot the count data into .raw before any filtering/normalization touches .X
adata.raw = adata.copy()
sc.pp.filter_genes(adata, min_cells=3)
#convert to lower to be species agnostic: human mito start with MT-, mouse with mt-
mito_genes = [name for name in adata.var_names if name.lower().startswith('mt-')]
# for each cell compute fraction of counts in mito genes vs. all genes
# the `.A1` is only necessary as X is sparse (to transform to a dense array after summing)
adata.obs['percent_mito'] = np.sum(
    adata[:, mito_genes].X, axis=1).A1 / np.sum(adata.X, axis=1).A1
#normalize each cell to 10,000 counts, then log1p -> log(CPM/100 + 1) in .X
sc.pp.normalize_per_cell(adata, counts_per_cell_after=1e4)
sc.pp.log1p(adata)
filtered out 7449 genes that are detected in less than 3 cells
normalizing by total count per cell
    finished (0:00:06): normalized adata.X and added    'n_counts', counts per cell before normalization (adata.obs)

Identify cell cycle genes by transposing the object and clustering the gene space, reporting back the cluster that contains known cycling genes as members (these genes will be excluded from downstream variable gene selection).

In [5]:
#work on a helper copy so adata itself is untouched by subsetting/scaling/transposition
bdata = adata.copy()
sc.pp.highly_variable_genes(bdata, min_mean=0.0125, max_mean=3, min_disp=0.5)
bdata = bdata[:, bdata.var['highly_variable']]
#transpose: genes become observations, so the standard pipeline now clusters gene space
bdata = bdata.copy().T
sc.pp.scale(bdata, max_value=10)
sc.tl.pca(bdata, svd_solver='arpack')
#elbow plot to choose the number of PCs used in the next cell
sc.pl.pca_variance_ratio(bdata, log=True, save='_ccg_identification.pdf')
extracting highly variable genes
    finished (0:00:05)
--> added
    'highly_variable', boolean vector (adata.var)
    'means', float vector (adata.var)
    'dispersions', float vector (adata.var)
    'dispersions_norm', float vector (adata.var)
computing PCA with n_comps = 50
    finished (0:00:04)
WARNING: saving figure to file figures-N2-clean-manifold/pca_variance_ratio_ccg_identification.pdf
In [6]:
#number of PCs chosen from the variance ratio elbow plot above
num_pcs = 20

sc.pp.neighbors(bdata, n_pcs=num_pcs)
sc.tl.umap(bdata)
sc.tl.leiden(bdata, resolution=0.4)
#known proliferation markers; whichever gene-space cluster holds them is the cycling one
known_cyclers = ['CDK1','MKI67','CCNB2','PCNA']
bdata.obs['known_cyclers'] = [i in known_cyclers for i in bdata.obs_names]
sc.pl.umap(bdata, color=['leiden','known_cyclers'], color_map='OrRd', save='_ccg_identification.pdf')

#.loc with a list containing missing labels is deprecated (KeyError in newer pandas),
#and some known cyclers may not survive the HVG filter - reindex reports NaN for absentees
print(bdata.obs['leiden'].reindex(known_cyclers))
computing neighbors
    using 'X_pca' with n_pcs = 20
    finished: added to `.uns['neighbors']`
    'distances', distances for each pair of neighbors
    'connectivities', weighted adjacency matrix (0:00:02)
computing UMAP
    finished: added
    'X_umap', UMAP coordinates (adata.obsm) (0:00:07)
running Leiden clustering
    finished: found 10 clusters and added
    'leiden', the cluster labels (adata.obs, categorical) (0:00:00)
WARNING: saving figure to file figures-N2-clean-manifold/umap_ccg_identification.pdf
index
CDK1       9
MKI67      9
CCNB2      9
PCNA     NaN
Name: leiden, dtype: category
Categories (10, object): [0, 1, 2, 3, ..., 6, 7, 8, 9]
/usr/local/lib/python3.6/dist-packages/pandas/core/indexing.py:961: FutureWarning: 
Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike
  return getattr(section, self.name)[new_key]

With the cycling genes identified, clean up the plotting folder by moving all related plots to a subfolder.

In [7]:
def MovePlots(plotpattern, subplotdir):
    """Move every figure in sc.settings.figdir whose file name contains
    plotpattern into the subplotdir subfolder, creating it if needed."""
    #stdlib file operations instead of shelling out to mv: portable, and failures
    #raise instead of being silently discarded like an os.system() return code.
    #also fixes the original's inconsistent path (existence check lacked the '/').
    import glob
    import shutil
    figdir = str(sc.settings.figdir)
    target = os.path.join(figdir, subplotdir)
    os.makedirs(target, exist_ok=True)
    for plot in glob.glob(os.path.join(figdir, '*' + plotpattern + '*.*')):
        shutil.move(plot, os.path.join(target, os.path.basename(plot)))

MovePlots('ccg_identification','ccg_identification')

Flag cell cycling genes and continue standard pipeline in a helper object until PCA.

In [8]:
#flag cell cycle genes: members of the gene-space cluster ('9') that held the known cyclers.
#hoist the cluster lookup out of the comprehension (original recomputed the subset per gene)
#and use a set for O(1) membership tests
ccg_index = set(bdata.obs.index[bdata.obs['leiden'] == '9'])
adata.var['is_ccg'] = [gene in ccg_index for gene in adata.var_names]
#compute HVG and PCA on separate helper object and port the results back to the original one
bdata = adata[:, ~adata.var['is_ccg']]
sc.pp.highly_variable_genes(bdata, min_mean=0.0125, max_mean=3, min_disp=0.5)
for col in ['highly_variable','means', 'dispersions', 'dispersions_norm']:
    adata.var[col] = bdata.var[col]
#fill NaNs with False so that subsetting to HVGs is possible
adata.var['highly_variable'].fillna(value=False, inplace=True)
bdata = bdata[:, bdata.var['highly_variable']]
sc.pp.scale(bdata, max_value=10)
sc.tl.pca(bdata, svd_solver='arpack', n_comps=50)
#port the PCA back; loadings for excluded (ccg/non-HVG) genes are zero-filled
adata.obsm['X_pca'] = bdata.obsm['X_pca'].copy()
adata.uns['pca'] = bdata.uns['pca'].copy()
adata.varm['PCs'] = np.zeros(shape=(adata.n_vars, 50))
adata.varm['PCs'][adata.var['highly_variable']] = bdata.varm['PCs']
sc.pl.pca_variance_ratio(adata, log=True, save='.pdf')
extracting highly variable genes
    finished (0:00:05)
--> added
    'highly_variable', boolean vector (adata.var)
    'means', float vector (adata.var)
    'dispersions', float vector (adata.var)
    'dispersions_norm', float vector (adata.var)
Trying to set attribute `.var` of view, making a copy.
/usr/local/lib/python3.6/dist-packages/scanpy/preprocessing/_simple.py:909: UserWarning: Revieved a view of an AnnData. Making a copy.
  view_to_actual(adata)
computing PCA with n_comps = 50
    on highly variable genes
    finished (0:00:09)
WARNING: saving figure to file figures-N2-clean-manifold/pca_variance_ratio.pdf

Correct cross-individual batch effects with Harmony.

In [9]:
#number of ccg-free PCs handed to Harmony, chosen from the variance ratio plot above
num_pcs = 20

pca = adata.obsm['X_pca'][:, :num_pcs]
#batch variable: correct across individuals
batch = adata.obs['individual']
In [10]:
%load_ext rpy2.ipython
In [11]:
%%R -i pca -i batch -o hem

library(harmony)
library(magrittr)

#this gets "90% of the way" to determinism. better than nothing.
set.seed(1)

#do_pca=FALSE as we pass in precomputed (cell-cycle-free) PCs rather than expression
hem = HarmonyMatrix(pca, batch, theta=1, verbose=FALSE, do_pca=FALSE)
#wrap in a data.frame so rpy2 can export the result back to Python as `hem`
hem = data.frame(hem)
R[write to console]: Loading required package: Rcpp

Compute clustering and UMAP, plot metadata, clustering and individual location UMAPs.

In [12]:
#use the Harmony-corrected PC space (not X_pca) to build the neighbor graph
adata.obsm['X_harmony'] = hem.values
sc.pp.neighbors(adata, use_rep='X_harmony')
sc.tl.leiden(adata, resolution = 0.9)
sc.tl.umap(adata)

#need to set this up manually as the different metadata csvs are not fully compatible
plotmeta = ['individual','location','phase','clinical','day','type','treatment','sample']

sc.pl.umap(adata, color=plotmeta, save='.pdf')
sc.pl.umap(adata, color='leiden', save='_clustering.pdf')
sc.pl.umap(adata, color='leiden', legend_loc='on data', save='_clustering_clusnumbers.pdf')
computing neighbors
/usr/local/lib/python3.6/dist-packages/numba/typed_passes.py:293: NumbaPerformanceWarning: 
The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.

To find out why, try turning on parallel diagnostics, see http://numba.pydata.org/numba-doc/latest/user/parallel.html#diagnostics for help.

File "../../../../usr/local/lib/python3.6/dist-packages/umap/rp_tree.py", line 135:
@numba.njit(fastmath=True, nogil=True, parallel=True)
def euclidean_random_projection_split(data, indices, rng_state):
^

  state.func_ir.loc))
/usr/local/lib/python3.6/dist-packages/umap/nndescent.py:92: NumbaPerformanceWarning: 
The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.

To find out why, try turning on parallel diagnostics, see http://numba.pydata.org/numba-doc/latest/user/parallel.html#diagnostics for help.

File "../../../../usr/local/lib/python3.6/dist-packages/umap/utils.py", line 409:
@numba.njit(parallel=True)
def build_candidates(current_graph, n_vertices, n_neighbors, max_candidates, rng_state):
^

  current_graph, n_vertices, n_neighbors, max_candidates, rng_state
/usr/local/lib/python3.6/dist-packages/numba/typed_passes.py:293: NumbaPerformanceWarning: 
The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.

To find out why, try turning on parallel diagnostics, see http://numba.pydata.org/numba-doc/latest/user/parallel.html#diagnostics for help.

File "../../../../usr/local/lib/python3.6/dist-packages/umap/nndescent.py", line 47:
    @numba.njit(parallel=True)
    def nn_descent(
    ^

  state.func_ir.loc))
    finished: added to `.uns['neighbors']`
    'distances', distances for each pair of neighbors
    'connectivities', weighted adjacency matrix (0:00:17)
running Leiden clustering
    finished: found 19 clusters and added
    'leiden', the cluster labels (adata.obs, categorical) (0:01:16)
computing UMAP
    finished: added
    'X_umap', UMAP coordinates (adata.obsm) (0:01:15)
WARNING: saving figure to file figures-N2-clean-manifold/umap.pdf
WARNING: saving figure to file figures-N2-clean-manifold/umap_clustering.pdf
WARNING: saving figure to file figures-N2-clean-manifold/umap_clustering_clusnumbers.pdf

Plot canonical markers for ease of annotation.

In [13]:
with open('markers.txt','r') as fid:
    markers = np.unique([line.rstrip() for line in fid])

#make sure they're in the dataset, and sort them alphabetically for ease of finding things
markers = sorted([gene for gene in markers if gene in adata.var_names])

#plot the markers four genes per figure
num_panels = (len(markers) + 3) // 4
for panel in range(num_panels):
    sc.pl.umap(adata, color=markers[panel*4:(panel+1)*4], use_raw=False, save='-markers'+str(panel+1)+'.pdf', color_map='OrRd')

#goodbye clutter!
MovePlots('markers','markers')
WARNING: saving figure to file figures-N2-clean-manifold/umap-markers1.pdf
WARNING: saving figure to file figures-N2-clean-manifold/umap-markers2.pdf
WARNING: saving figure to file figures-N2-clean-manifold/umap-markers3.pdf
WARNING: saving figure to file figures-N2-clean-manifold/umap-markers4.pdf
WARNING: saving figure to file figures-N2-clean-manifold/umap-markers5.pdf
WARNING: saving figure to file figures-N2-clean-manifold/umap-markers6.pdf
WARNING: saving figure to file figures-N2-clean-manifold/umap-markers7.pdf
WARNING: saving figure to file figures-N2-clean-manifold/umap-markers8.pdf
WARNING: saving figure to file figures-N2-clean-manifold/umap-markers9.pdf
WARNING: saving figure to file figures-N2-clean-manifold/umap-markers10.pdf
WARNING: saving figure to file figures-N2-clean-manifold/umap-markers11.pdf

Identify cluster markers via the rank-biserial correlation, an effect size equivalent of the Wilcoxon test. Threshold the coefficient with the literature critical value for a large effect (0.5), and possibly lower the stringency if any clusters have fewer than five markers. Write out resulting marker list to CSV.

In [14]:
#rank-biserial correlation per cluster; use_raw=False so the log-normalized .X is used
rbcde.RBC(adata, use_raw=False)

plot_dict = {}
degs = pd.DataFrame(columns=['cluster','RBC'])
#start at the literature "large effect" cutoff (0.5) and progressively relax; a
#cluster's markers are frozen at the first threshold yielding at least five, so
#stricter marker sets are preferred when available
for thresh in [0.5,0.3,0.2,0.1]:
    degs_temp, temp_dict = rbcde.filter_markers(adata, use_raw=False, thresh=thresh)
    for clus in temp_dict:
        if len(temp_dict[clus])>=5 and clus not in plot_dict:
            plot_dict[clus] = temp_dict[clus]
            degs = pd.concat([degs, degs_temp.loc[degs_temp['cluster']==clus,:]])
    #stop relaxing once every cluster reported by the filter has an accepted marker set
    if set(plot_dict.keys()) == set(temp_dict.keys()):
        break
degs.to_csv(str(sc.settings.figdir)+'/cluster_markers_rbc.csv')
computing rank-biserial correlation
	finished: added
	'RBC_' columns for each of the clusters to .var (0:02:09)
--> 40 markers found for cluster 0
--> 43 markers found for cluster 1
--> 32 markers found for cluster 10
--> 40 markers found for cluster 11
--> 32 markers found for cluster 12
--> 10 markers found for cluster 13
--> 108 markers found for cluster 14
--> 428 markers found for cluster 15
--> 55 markers found for cluster 16
--> 148 markers found for cluster 17
--> 294 markers found for cluster 18
--> 20 markers found for cluster 2
--> 76 markers found for cluster 3
--> 121 markers found for cluster 4
--> 98 markers found for cluster 5
--> 16 markers found for cluster 6
--> 16 markers found for cluster 7
--> 124 markers found for cluster 8
--> 1 markers found for cluster 9
--> 192 markers found for cluster 0
--> 129 markers found for cluster 1
--> 103 markers found for cluster 10
--> 153 markers found for cluster 11
--> 123 markers found for cluster 12
--> 27 markers found for cluster 13
--> 303 markers found for cluster 14
--> 1213 markers found for cluster 15
--> 212 markers found for cluster 16
--> 420 markers found for cluster 17
--> 888 markers found for cluster 18
--> 172 markers found for cluster 2
--> 203 markers found for cluster 3
--> 345 markers found for cluster 4
--> 461 markers found for cluster 5
--> 142 markers found for cluster 6
--> 83 markers found for cluster 7
--> 400 markers found for cluster 8
--> 16 markers found for cluster 9

Make dot plots (capped at the top 100 markers per cluster in the interest of legibility) and export more detailed per-cluster marker statistics.

In [15]:
#expm1 inverts log1p, recovering normalized counts for fold change computation
adata_count = AnnData(np.expm1(adata.X), var=adata.var, obs=adata.obs)
for clus in plot_dict:
    degs_clus = degs.loc[degs['cluster']==clus,:].copy()
    degs_clus['log2_FC'] = 0
    degs_clus['percent_cluster'] = 0
    degs_clus['percent_rest'] = 0
    #vectorized complement mask instead of a per-cell Python list comprehension
    in_clus = np.asarray(adata.obs['leiden'] == clus)
    adata_clus = adata_count[in_clus]
    adata_rest = adata_count[~in_clus]
    adata_clus = adata_clus[:, degs_clus.index]
    adata_rest = adata_rest[:, degs_clus.index]
    degs_clus['log2_FC'] = np.asarray(np.log2(np.mean(adata_clus.X,axis=0)/np.mean(adata_rest.X,axis=0))).reshape(-1)
    #are you expressed? thresholding .data assumes sparse X with no explicit zeros stored
    adata_clus.X.data = adata_clus.X.data > 0
    adata_rest.X.data = adata_rest.X.data > 0
    degs_clus['percent_cluster'] = np.asarray(100*np.sum(adata_clus.X,axis=0)/adata_clus.shape[0]).reshape(-1)
    degs_clus['percent_rest'] = np.asarray(100*np.sum(adata_rest.X,axis=0)/adata_rest.shape[0]).reshape(-1)
    degs_clus.to_csv(str(sc.settings.figdir)+'/cluster_markers_'+clus+'.csv')
    #cap dotplots at 100 genes per cluster in the interest of legibility
    sc.pl.dotplot(adata, {clus: plot_dict[clus][:100]}, groupby='leiden', use_raw=False, save='_cluster_markers_'+clus+'.pdf')

#goodbye clutter!
MovePlots('cluster_markers','cluster_markers')
WARNING: saving figure to file figures-N2-clean-manifold/dotplot_cluster_markers_0.pdf
WARNING: saving figure to file figures-N2-clean-manifold/dotplot_cluster_markers_1.pdf
WARNING: saving figure to file figures-N2-clean-manifold/dotplot_cluster_markers_10.pdf
WARNING: saving figure to file figures-N2-clean-manifold/dotplot_cluster_markers_11.pdf
WARNING: saving figure to file figures-N2-clean-manifold/dotplot_cluster_markers_12.pdf
WARNING: saving figure to file figures-N2-clean-manifold/dotplot_cluster_markers_13.pdf
WARNING: saving figure to file figures-N2-clean-manifold/dotplot_cluster_markers_14.pdf
WARNING: saving figure to file figures-N2-clean-manifold/dotplot_cluster_markers_15.pdf
WARNING: saving figure to file figures-N2-clean-manifold/dotplot_cluster_markers_16.pdf
WARNING: saving figure to file figures-N2-clean-manifold/dotplot_cluster_markers_17.pdf
WARNING: saving figure to file figures-N2-clean-manifold/dotplot_cluster_markers_18.pdf
WARNING: saving figure to file figures-N2-clean-manifold/dotplot_cluster_markers_2.pdf
WARNING: saving figure to file figures-N2-clean-manifold/dotplot_cluster_markers_3.pdf
WARNING: saving figure to file figures-N2-clean-manifold/dotplot_cluster_markers_4.pdf
WARNING: saving figure to file figures-N2-clean-manifold/dotplot_cluster_markers_5.pdf
WARNING: saving figure to file figures-N2-clean-manifold/dotplot_cluster_markers_6.pdf
WARNING: saving figure to file figures-N2-clean-manifold/dotplot_cluster_markers_7.pdf
WARNING: saving figure to file figures-N2-clean-manifold/dotplot_cluster_markers_8.pdf
WARNING: saving figure to file figures-N2-clean-manifold/dotplot_cluster_markers_9.pdf

And with that, we think we're done. Save the object for later.

In [16]:
#persist the cleaned, batch-corrected manifold for downstream notebooks
adata.write('endometrium-N2-clean-manifold.h5ad')